References:
1. https://statsandr.com/blog/descriptive-statistics-in-r/
2. https://towardsdatascience.com/data-cleaning-with-r-and-the-tidyverse-detecting-missing-values-ea23c519bc62
3. https://rstudio-pubs-static.s3.amazonaws.com/3364_d1a578f521174152b46b19d0c83cbe7e.html
4. https://medium.com/data-science-in-your-pocket/various-data-distributions-in-statistics-362dc92558db
5. https://stackoverflow.com/questions/5570293/add-column-which-contains-binned-values-of-an-integer-column
library(tidyr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ dplyr 1.0.7
## ✓ tibble 3.1.5 ✓ stringr 1.4.0
## ✓ readr 2.1.0 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(summarytools)
##
## Attaching package: 'summarytools'
## The following object is masked from 'package:tibble':
##
## view
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(dlookr)
##
## Attaching package: 'dlookr'
## The following object is masked from 'package:tidyr':
##
## extract
## The following object is masked from 'package:base':
##
## transform
data_1990_1999 <- read.csv("../data/resale-flat-prices-based-on-approval-date-1990-1999.csv", quote = "", header = TRUE, sep = ",")
data_2000_2012 <- read.csv("../data/resale-flat-prices-based-on-approval-date-2000-feb-2012.csv", header = TRUE, sep = ",")
data_2012_2014 <- read.csv("../data/resale-flat-prices-based-on-registration-date-from-mar-2012-to-dec-2014.csv", header = TRUE, sep = ",")
data_2015_2016 <- read.csv("../data/resale-flat-prices-based-on-registration-date-from-jan-2015-to-dec-2016.csv", header = TRUE, sep = ",")
data_2017 <- read.csv("../data/resale-flat-prices-based-on-registration-date-from-jan-2017-onwards.csv", header = TRUE, sep = ",")
print("data_1990_1999:")
## [1] "data_1990_1999:"
str(data_1990_1999)
## 'data.frame': 287196 obs. of 10 variables:
## $ month : chr "1990-01" "1990-01" "1990-01" "1990-01" ...
## $ town : chr "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
## $ flat_type : chr "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" ...
## $ block : chr "309" "309" "309" "309" ...
## $ street_name : chr "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
## $ storey_range : chr "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" ...
## $ floor_area_sqm : num 31 31 31 31 73 67 67 67 67 67 ...
## $ flat_model : chr "IMPROVED" "IMPROVED" "IMPROVED" "IMPROVED" ...
## $ lease_commence_date: int 1977 1977 1977 1977 1976 1977 1977 1977 1977 1977 ...
## $ resale_price : int 9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
print("data_2000_2012:")
## [1] "data_2000_2012:"
str(data_2000_2012)
## 'data.frame': 369651 obs. of 10 variables:
## $ month : chr "2000-01" "2000-01" "2000-01" "2000-01" ...
## $ town : chr "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
## $ flat_type : chr "3 ROOM" "3 ROOM" "3 ROOM" "3 ROOM" ...
## $ block : chr "170" "174" "216" "215" ...
## $ street_name : chr "ANG MO KIO AVE 4" "ANG MO KIO AVE 4" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
## $ storey_range : chr "07 TO 09" "04 TO 06" "07 TO 09" "07 TO 09" ...
## $ floor_area_sqm : num 69 61 73 73 67 73 73 68 68 82 ...
## $ flat_model : chr "Improved" "Improved" "New Generation" "New Generation" ...
## $ lease_commence_date: int 1986 1986 1976 1976 1976 1977 1977 1981 1981 1981 ...
## $ resale_price : num 147000 144000 159000 167000 163000 157000 178000 160000 169000 205000 ...
print("data_2015_2016:")
## [1] "data_2015_2016:"
str(data_2015_2016)
## 'data.frame': 37153 obs. of 11 variables:
## $ month : chr "2015-01" "2015-01" "2015-01" "2015-01" ...
## $ town : chr "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
## $ flat_type : chr "3 ROOM" "3 ROOM" "3 ROOM" "3 ROOM" ...
## $ block : chr "174" "541" "163" "446" ...
## $ street_name : chr "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" ...
## $ storey_range : chr "07 TO 09" "01 TO 03" "01 TO 03" "01 TO 03" ...
## $ floor_area_sqm : num 60 68 69 68 68 67 68 68 67 68 ...
## $ flat_model : chr "Improved" "New Generation" "New Generation" "New Generation" ...
## $ lease_commence_date: int 1986 1981 1980 1979 1980 1980 1980 1981 1978 1985 ...
## $ remaining_lease : int 70 65 64 63 64 64 64 65 62 69 ...
## $ resale_price : num 255000 275000 285000 290000 290000 ...
print("data_2017:")
## [1] "data_2017:"
str(data_2017)
## 'data.frame': 115302 obs. of 11 variables:
## $ month : chr "2017-01" "2017-01" "2017-01" "2017-01" ...
## $ town : chr "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
## $ flat_type : chr "2 ROOM" "3 ROOM" "3 ROOM" "3 ROOM" ...
## $ block : chr "406" "108" "602" "465" ...
## $ street_name : chr "ANG MO KIO AVE 10" "ANG MO KIO AVE 4" "ANG MO KIO AVE 5" "ANG MO KIO AVE 10" ...
## $ storey_range : chr "10 TO 12" "01 TO 03" "01 TO 03" "04 TO 06" ...
## $ floor_area_sqm : num 44 67 67 68 67 68 68 67 68 67 ...
## $ flat_model : chr "Improved" "New Generation" "New Generation" "New Generation" ...
## $ lease_commence_date: int 1979 1978 1980 1980 1980 1981 1979 1976 1979 1979 ...
## $ remaining_lease : chr "61 years 04 months" "60 years 07 months" "62 years 05 months" "62 years 01 month" ...
## $ resale_price : num 232000 250000 262000 265000 265000 275000 280000 285000 285000 285000 ...
print("data_2012_2014:")
## [1] "data_2012_2014:"
str(data_2012_2014)
## 'data.frame': 52203 obs. of 10 variables:
## $ month : chr "2012-03" "2012-03" "2012-03" "2012-03" ...
## $ town : chr "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
## $ flat_type : chr "2 ROOM" "2 ROOM" "3 ROOM" "3 ROOM" ...
## $ block : chr "172" "510" "610" "474" ...
## $ street_name : chr "ANG MO KIO AVE 4" "ANG MO KIO AVE 8" "ANG MO KIO AVE 4" "ANG MO KIO AVE 10" ...
## $ storey_range : chr "06 TO 10" "01 TO 05" "06 TO 10" "01 TO 05" ...
## $ floor_area_sqm : num 45 44 68 67 67 68 67 67 67 67 ...
## $ flat_model : chr "Improved" "Improved" "New Generation" "New Generation" ...
## $ lease_commence_date: int 1986 1980 1980 1984 1980 1981 1978 1979 1979 1985 ...
## $ resale_price : num 250000 265000 315000 320000 321000 321000 323000 325000 328000 330000 ...
These datasets has 10 variables:
1. data_1990_1999
2. data_2000_2012
3. data_2012_2014
While the remaining datasets has 11 variables:
1. data_2015_2016
2. data_2017
The additional variable is remaining_lease, this variable only available in data from year 2015 onwards
Add new column “remaining_lease” for that 3 datasets and assign “NA”
data_1990_1999$remaining_lease <- NA
data_2000_2012$remaining_lease <- NA
data_2012_2014$remaining_lease <- NA
data <- rbind(data_1990_1999, data_2000_2012)
data <- rbind(data, data_2015_2016)
data <- rbind(data, data_2017)
data <- rbind(data, data_2012_2014)
str(data)
## 'data.frame': 861505 obs. of 11 variables:
## $ month : chr "1990-01" "1990-01" "1990-01" "1990-01" ...
## $ town : chr "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
## $ flat_type : chr "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" ...
## $ block : chr "309" "309" "309" "309" ...
## $ street_name : chr "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
## $ storey_range : chr "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" ...
## $ floor_area_sqm : num 31 31 31 31 73 67 67 67 67 67 ...
## $ flat_model : chr "IMPROVED" "IMPROVED" "IMPROVED" "IMPROVED" ...
## $ lease_commence_date: int 1977 1977 1977 1977 1976 1977 1977 1977 1977 1977 ...
## $ resale_price : num 9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
## $ remaining_lease : chr NA NA NA NA ...
The merged dataset contains total 861,505 observations (flat resales transaction) and 11 variables.
summary(data)
## month town flat_type block
## Length:861505 Length:861505 Length:861505 Length:861505
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## street_name storey_range floor_area_sqm flat_model
## Length:861505 Length:861505 Min. : 28.00 Length:861505
## Class :character Class :character 1st Qu.: 73.00 Class :character
## Mode :character Mode :character Median : 93.00 Mode :character
## Mean : 95.67
## 3rd Qu.:113.00
## Max. :307.00
## lease_commence_date resale_price remaining_lease
## Min. :1966 Min. : 5000 Length:861505
## 1st Qu.:1980 1st Qu.: 185000 Class :character
## Median :1986 Median : 282000 Mode :character
## Mean :1988 Mean : 302067
## 3rd Qu.:1995 3rd Qu.: 395000
## Max. :2019 Max. :1360000
head(data)
## month town flat_type block street_name storey_range
## 1 1990-01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 2 1990-01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 04 TO 06
## 3 1990-01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 4 1990-01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 07 TO 09
## 5 1990-01 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 04 TO 06
## 6 1990-01 ANG MO KIO 3 ROOM 211 ANG MO KIO AVE 3 01 TO 03
## floor_area_sqm flat_model lease_commence_date resale_price
## 1 31 IMPROVED 1977 9000
## 2 31 IMPROVED 1977 6000
## 3 31 IMPROVED 1977 8000
## 4 31 IMPROVED 1977 6000
## 5 73 NEW GENERATION 1976 47200
## 6 67 NEW GENERATION 1977 46000
## remaining_lease
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
tail(data)
## month town flat_type block street_name storey_range
## 861500 2014-12 YISHUN 5 ROOM 819 YISHUN ST 81 01 TO 03
## 861501 2014-12 YISHUN 5 ROOM 816 YISHUN ST 81 10 TO 12
## 861502 2014-12 YISHUN EXECUTIVE 325 YISHUN CTRL 10 TO 12
## 861503 2014-12 YISHUN EXECUTIVE 618 YISHUN RING RD 07 TO 09
## 861504 2014-12 YISHUN EXECUTIVE 277 YISHUN ST 22 07 TO 09
## 861505 2014-12 YISHUN EXECUTIVE 277 YISHUN ST 22 04 TO 06
## floor_area_sqm flat_model lease_commence_date resale_price
## 861500 127 Improved 1987 500000
## 861501 122 Improved 1988 580000
## 861502 146 Maisonette 1988 540000
## 861503 164 Apartment 1992 738000
## 861504 152 Maisonette 1985 592000
## 861505 146 Maisonette 1985 545000
## remaining_lease
## 861500 <NA>
## 861501 <NA>
## 861502 <NA>
## 861503 <NA>
## 861504 <NA>
## 861505 <NA>
head(data$month)
## [1] "1990-01" "1990-01" "1990-01" "1990-01" "1990-01" "1990-01"
tail(data$month)
## [1] "2014-12" "2014-12" "2014-12" "2014-12" "2014-12" "2014-12"
data <- data %>% separate(month, c("year","month"), "-")
head(data)
## year month town flat_type block street_name storey_range
## 1 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 2 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 04 TO 06
## 3 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 4 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 07 TO 09
## 5 1990 01 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 04 TO 06
## 6 1990 01 ANG MO KIO 3 ROOM 211 ANG MO KIO AVE 3 01 TO 03
## floor_area_sqm flat_model lease_commence_date resale_price
## 1 31 IMPROVED 1977 9000
## 2 31 IMPROVED 1977 6000
## 3 31 IMPROVED 1977 8000
## 4 31 IMPROVED 1977 6000
## 5 73 NEW GENERATION 1976 47200
## 6 67 NEW GENERATION 1977 46000
## remaining_lease
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
str(data$year)
## chr [1:861505] "1990" "1990" "1990" "1990" "1990" "1990" "1990" "1990" ...
str(data$month)
## chr [1:861505] "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" "01" ...
unique(sort(data$year))
## [1] "1990" "1991" "1992" "1993" "1994" "1995" "1996" "1997" "1998" "1999"
## [11] "2000" "2001" "2002" "2003" "2004" "2005" "2006" "2007" "2008" "2009"
## [21] "2010" "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "2019"
## [31] "2020" "2021"
unique(sort(data$month))
## [1] "01" "02" "03" "04" "05" "06" "07" "08" "09" "10" "11" "12"
data %>% summarise(na_year = sum(is.na(year)),
na_month = sum(is.na(month)))
## na_year na_month
## 1 0 0
data$int_year <- as.integer(data$year)
data$year <- as.factor(data$year)
data$month <- as.factor(data$month)
str(data$year)
## Factor w/ 32 levels "1990","1991",..: 1 1 1 1 1 1 1 1 1 1 ...
str(data$month)
## Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
sort(summary(data$year))
## 1990 1991 1992 2014 2013 2015 1993 2016 2017 2018 2019 2011 2012
## 12505 12855 14503 16096 16097 17780 18116 19373 20509 21561 22186 22281 23198
## 2020 1994 2007 2008 1995 2006 2021 2003 2004 2005 2009 1997 2010
## 23333 26373 26982 27262 27289 27427 27713 29003 29112 30045 30482 31759 34854
## 2000 1996 2002 2001 1998 1999
## 34862 34919 36098 38055 51095 57782
sort(summary(data$month))
## 02 12 01 04 05 09 06 11 03 08 10 07
## 60812 67944 68734 69956 70016 72831 73476 73689 73997 74962 77450 77638
barplot(table(data$year), ylim=c(0,60000), las = 2)
p <- ggplot(data, aes(x = fct_infreq(year), fill = fct_infreq(year))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("year")
p + coord_flip() + scale_y_continuous(limit = c(0, 60000))
the highest transaction records was in year 1999, more than 50,000 rows out of total 861,505 rows
the lowest transaction records was in year 1990, less than 15,000 rows
there is a significant spike of transaction records in year 1998 then a significant drop in year 2000
what is the distribution type of this statistic? https://medium.com/data-science-in-your-pocket/various-data-distributions-in-statistics-362dc92558db
exploration of month by plotting
barplot(table(data$month), ylim=c(0,80000), las = 2)
p <- ggplot(data, aes(x = fct_infreq(month), fill = fct_infreq(month))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("month")
p + coord_flip() + scale_y_continuous(limit = c(0, 80000))
head(data$town)
## [1] "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO"
## [6] "ANG MO KIO"
tail(data$town)
## [1] "YISHUN" "YISHUN" "YISHUN" "YISHUN" "YISHUN" "YISHUN"
str(data$town)
## chr [1:861505] "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" "ANG MO KIO" ...
data %>% distinct(town)
## town
## 1 ANG MO KIO
## 2 BEDOK
## 3 BISHAN
## 4 BUKIT BATOK
## 5 BUKIT MERAH
## 6 BUKIT TIMAH
## 7 CENTRAL AREA
## 8 CHOA CHU KANG
## 9 CLEMENTI
## 10 GEYLANG
## 11 HOUGANG
## 12 JURONG EAST
## 13 JURONG WEST
## 14 KALLANG/WHAMPOA
## 15 MARINE PARADE
## 16 QUEENSTOWN
## 17 SENGKANG
## 18 SERANGOON
## 19 TAMPINES
## 20 TOA PAYOH
## 21 WOODLANDS
## 22 YISHUN
## 23 LIM CHU KANG
## 24 SEMBAWANG
## 25 BUKIT PANJANG
## 26 PASIR RIS
## 27 PUNGGOL
data %>% summarise(na = sum(is.na(town)))
## na
## 1 0
data$town <- as.factor(data$town)
str(data$town)
## Factor w/ 27 levels "ANG MO KIO","BEDOK",..: 1 1 1 1 1 1 1 1 1 1 ...
sort(summary(data$town))
## LIM CHU KANG BUKIT TIMAH CENTRAL AREA MARINE PARADE SEMBAWANG
## 64 2377 6650 7640 11601
## PUNGGOL BISHAN SERANGOON JURONG EAST KALLANG/WHAMPOA
## 15605 20258 21765 23566 25075
## BUKIT PANJANG GEYLANG CLEMENTI QUEENSTOWN SENGKANG
## 25568 26576 26607 27096 27234
## TOA PAYOH PASIR RIS BUKIT MERAH CHOA CHU KANG BUKIT BATOK
## 29612 31426 31912 35159 41343
## HOUGANG ANG MO KIO WOODLANDS JURONG WEST BEDOK
## 47248 49543 60667 62547 63328
## YISHUN TAMPINES
## 65336 75702
p <- ggplot(data, aes(x = fct_infreq(town), fill = fct_infreq(town))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("town")
p + coord_flip() + scale_y_continuous(limit = c(0, 80000))
# Zoom in to town with frequency < 10,000
data_filter <- data %>%
group_by(town) %>%
filter(n() < 8000)
p <- ggplot(data_filter, aes(x = fct_infreq(town), fill = fct_infreq(town))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("town")
p + coord_flip() + scale_y_continuous(limit = c(0, 8000))
TAMPINES has the highest transaction records, more than 70,000 rows out of total 861,505 rows
LIM CHU KANG has the lowest transaction records, less than 2,500 rows
both LIM CHU KANG and BUKIT TIMAH have less than 2,500 transaction records, are these outliers?
classify towns into region (based on Wei Wen’s file)
data$region <- ifelse (data$town %in% c("BUKIT MERAH","BUKIT TIMAH","GEYLANG","TOA PAYOH",
"BISHAN","KALLANG/WHAMPOA","MARINE PARADE"), "Central",
ifelse (data$town %in% c("TAMPINES","BEDOK","PASIR RIS"), "East",
ifelse (data$town %in% c("LIM CHU KANG","SEMBAWANG","WOODLANDS","YISHUN"), "North",
ifelse (data$town %in% c("ANG MO KIO","HOUGANG","PUNGGOL","SENGKANG",
"SERANGOON"), "North-East",
ifelse (data$town %in% c("BUKIT BATOK","BUKIT PANJANG",
"CHOA CHU KANG","CLEMENTI",
"JURONG EAST","JURONG WEST"), "West",
"Suburb")))))
head(data)
## year month town flat_type block street_name storey_range
## 1 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 2 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 04 TO 06
## 3 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 4 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 07 TO 09
## 5 1990 01 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 04 TO 06
## 6 1990 01 ANG MO KIO 3 ROOM 211 ANG MO KIO AVE 3 01 TO 03
## floor_area_sqm flat_model lease_commence_date resale_price
## 1 31 IMPROVED 1977 9000
## 2 31 IMPROVED 1977 6000
## 3 31 IMPROVED 1977 8000
## 4 31 IMPROVED 1977 6000
## 5 73 NEW GENERATION 1976 47200
## 6 67 NEW GENERATION 1977 46000
## remaining_lease int_year region
## 1 <NA> 1990 North-East
## 2 <NA> 1990 North-East
## 3 <NA> 1990 North-East
## 4 <NA> 1990 North-East
## 5 <NA> 1990 North-East
## 6 <NA> 1990 North-East
head(data$region)
## [1] "North-East" "North-East" "North-East" "North-East" "North-East"
## [6] "North-East"
tail(data$region)
## [1] "North" "North" "North" "North" "North" "North"
str(data$region)
## chr [1:861505] "North-East" "North-East" "North-East" "North-East" ...
data$region <- as.factor(data$region)
str(data$region)
## Factor w/ 6 levels "Central","East",..: 4 4 4 4 4 4 4 4 4 4 ...
sort(summary(data$region))
## Suburb North Central North-East East West
## 33746 137668 143450 161395 170456 214790
p <- ggplot(data, aes(x = fct_infreq(region), fill = fct_infreq(region))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("region")
p + coord_flip() + scale_y_continuous(limit = c(0, 250000))
head(data$flat_type)
## [1] "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" "3 ROOM" "3 ROOM"
tail(data$flat_type)
## [1] "5 ROOM" "5 ROOM" "EXECUTIVE" "EXECUTIVE" "EXECUTIVE" "EXECUTIVE"
str(data$flat_type)
## chr [1:861505] "1 ROOM" "1 ROOM" "1 ROOM" "1 ROOM" "3 ROOM" "3 ROOM" ...
data %>% distinct(flat_type)
## flat_type
## 1 1 ROOM
## 2 3 ROOM
## 3 4 ROOM
## 4 5 ROOM
## 5 2 ROOM
## 6 EXECUTIVE
## 7 MULTI GENERATION
## 8 MULTI-GENERATION
data %>% summarise(na = sum(is.na(flat_type)))
## na
## 1 0
data <- data %>%
mutate(flat_type = replace(flat_type, flat_type == "MULTI GENERATION", "MULTI-GENERATION"))
data %>% distinct(flat_type)
## flat_type
## 1 1 ROOM
## 2 3 ROOM
## 3 4 ROOM
## 4 5 ROOM
## 5 2 ROOM
## 6 EXECUTIVE
## 7 MULTI-GENERATION
data$flat_type <- as.factor(data$flat_type)
str(data$flat_type)
## Factor w/ 7 levels "1 ROOM","2 ROOM",..: 1 1 1 1 3 3 3 3 3 3 ...
sort(summary(data$flat_type))
## MULTI-GENERATION 1 ROOM 2 ROOM EXECUTIVE
## 525 1284 10326 65436
## 5 ROOM 3 ROOM 4 ROOM
## 179795 280095 324044
p <- ggplot(data, aes(x = fct_infreq(flat_type), fill = fct_infreq(flat_type))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("flat_type")
p + coord_flip() + scale_y_continuous(limit = c(0, 350000))
# Zoom in to flat type with frequency < 5,000
data_filter <- data %>%
group_by(flat_type) %>%
filter(n() < 5000)
p <- ggplot(data_filter, aes(x = fct_infreq(flat_type), fill = fct_infreq(flat_type))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("flat_type")
p + coord_flip() + scale_y_continuous(limit = c(0, 5000))
head(data$block)
## [1] "309" "309" "309" "309" "216" "211"
tail(data$block)
## [1] "819" "816" "325" "618" "277" "277"
str(data$block)
## chr [1:861505] "309" "309" "309" "309" "216" "211" "202" "235" "235" "232" ...
data %>%
distinct(block) %>%
head()
## block
## 1 309
## 2 216
## 3 211
## 4 202
## 5 235
## 6 232
data %>%
distinct(block) %>%
tail()
## block
## 2562 512B
## 2563 512A
## 2564 509C
## 2565 511A
## 2566 511B
## 2567 512C
data %>% summarise(na = sum(is.na(block)))
## na
## 1 0
data$block <- as.factor(data$block)
str(data$block)
## Factor w/ 2567 levels "1","10","100",..: 799 799 799 799 431 414 372 503 503 498 ...
sort(summary(data$block))
## 308 302 43 409 145 412 408 231 157 35
## 1552 1555 1569 1575 1592 1599 1603 1613 1620 1622
## 131 141 217 135 419 18 206 148 232 134
## 1625 1632 1633 1651 1655 1658 1662 1671 1672 1680
## 417 406 127 219 229 214 54 146 16 19
## 1681 1690 1698 1699 1700 1705 1716 1719 1720 1727
## 411 138 403 230 15 212 204 407 118 205
## 1735 1737 1742 1759 1761 1764 1767 1771 1782 1784
## 44 142 213 202 23 28 33 208 24 123
## 1831 1844 1867 1872 1880 1895 1928 1936 1939 1954
## 17 30 129 21 32 104 126 119 11 128
## 1956 1960 1971 2051 2065 2106 2137 2139 2144 2169
## 117 121 10 9 125 211 12 122 130 20
## 2205 2222 2236 2266 2271 2274 2281 2300 2311 2327
## 210 22 13 34 124 120 106 103 116 115
## 2335 2352 2377 2448 2468 2471 2500 2676 2679 2704
## 111 105 7 112 5 109 102 6 108 114
## 2732 2755 2756 2762 2812 2844 2848 2883 2993 3047
## 3 107 113 8 4 101 110 1 2 (Other)
## 3057 3075 3149 3156 3192 3249 3263 3859 4423 648207
groupby_block <- data %>%
group_by(block) %>%
summarise(n = n()) %>%
arrange(n, decreasing = TRUE)
btm10 <- head(groupby_block,10)
top10 <- tail(groupby_block,10)
# TOP 10 block with highest transactions
data_filter <- data %>%
filter(block %in% top10$block)
p <- ggplot(data_filter, aes(x = fct_infreq(block), fill = fct_infreq(block))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("block")
p + coord_flip() + scale_y_continuous(limit = c(0, 5000))
# BOTTOM 10 block with lowest transactions
data_filter <- data %>%
filter(block %in% btm10$block)
p <- ggplot(data_filter, aes(x = fct_infreq(block), fill = fct_infreq(block))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("block")
p + coord_flip() + scale_y_continuous(limit = c(0, 10))
head(data$street_name)
## [1] "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1"
## [5] "ANG MO KIO AVE 1" "ANG MO KIO AVE 3"
tail(data$street_name)
## [1] "YISHUN ST 81" "YISHUN ST 81" "YISHUN CTRL" "YISHUN RING RD"
## [5] "YISHUN ST 22" "YISHUN ST 22"
str(data$street_name)
## chr [1:861505] "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" "ANG MO KIO AVE 1" ...
data %>%
distinct(street_name) %>%
head()
## street_name
## 1 ANG MO KIO AVE 1
## 2 ANG MO KIO AVE 3
## 3 ANG MO KIO AVE 4
## 4 ANG MO KIO AVE 10
## 5 ANG MO KIO AVE 5
## 6 ANG MO KIO AVE 8
data %>%
distinct(street_name) %>%
tail()
## street_name
## 571 TAMPINES ST 86
## 572 KEAT HONG LINK
## 573 ALJUNIED AVE 2
## 574 SUMANG LANE
## 575 CANBERRA CRES
## 576 CANBERRA ST
data %>% summarise(na = sum(is.na(street_name)))
## na
## 1 0
data$street_name <- as.factor(data$street_name)
str(data$street_name)
## Factor w/ 576 levels "ADMIRALTY DR",..: 14 14 14 14 14 17 17 17 17 17 ...
sort(summary(data$street_name))
## TAMPINES ST 33 GHIM MOH RD BEDOK NTH ST 2 TAMPINES ST 71
## 2699 2707 2721 2736
## EUNOS CRES TAMPINES ST 82 TAMPINES ST 45 JURONG EAST ST 32
## 2743 2783 2784 2821
## MARINE TER POTONG PASIR AVE 1 SERANGOON AVE 4 YISHUN ST 71
## 2823 2831 2840 2864
## PENDING RD TAMPINES ST 12 BOON LAY AVE WOODLANDS DR 50
## 2868 2887 2895 2900
## BT BATOK ST 52 JURONG WEST ST 61 CLEMENTI AVE 5 WOODLANDS DR 14
## 2904 2910 2938 2954
## YISHUN AVE 11 PUNGGOL FIELD TAMPINES ST 11 CHOA CHU KANG CTRL
## 2958 2962 2965 2980
## RIVERVALE DR JLN BT MERAH NEW UPP CHANGI RD CHOA CHU KANG AVE 3
## 2981 2984 2986 3034
## JELAPANG RD TELOK BLANGAH CRES LOR 8 TOA PAYOH C'WEALTH DR
## 3076 3081 3144 3159
## HOUGANG AVE 3 BT BATOK ST 21 JURONG EAST ST 24 BT PANJANG RING RD
## 3190 3212 3216 3226
## PETIR RD BT BATOK ST 31 TAMPINES ST 81 YISHUN ST 22
## 3233 3268 3290 3392
## WOODLANDS CIRCLE JURONG WEST ST 65 YISHUN ST 72 CLEMENTI WEST ST 2
## 3414 3472 3481 3492
## YISHUN AVE 5 PASIR RIS ST 71 SERANGOON NTH AVE 4 HOUGANG AVE 5
## 3493 3496 3501 3574
## CLEMENTI AVE 2 TEBAN GDNS RD JURONG WEST AVE 1 JURONG WEST ST 91
## 3590 3595 3614 3623
## TANGLIN HALT RD HOUGANG AVE 1 BT BATOK EAST AVE 5 TAMPINES ST 83
## 3655 3664 3689 3709
## CHOA CHU KANG CRES WOODLANDS RING RD BOON LAY PL CHOA CHU KANG AVE 4
## 3712 3759 3882 3949
## HOUGANG AVE 10 PASIR RIS ST 11 YISHUN AVE 6 SERANGOON NTH AVE 1
## 3951 4000 4092 4104
## PASIR RIS DR 6 BISHAN ST 12 JURONG WEST ST 41 TAMPINES ST 41
## 4173 4196 4215 4280
## PASIR RIS ST 21 TECK WHYE LANE WOODLANDS ST 13 YISHUN ST 81
## 4283 4308 4382 4396
## UBI AVE 1 JURONG WEST ST 52 BISHAN ST 13 YISHUN ST 61
## 4458 4463 4506 4518
## JURONG WEST ST 81 YISHUN ST 21 CLEMENTI AVE 4 YISHUN ST 11
## 4595 4690 4697 4710
## TAMPINES ST 22 ANG MO KIO AVE 1 CIRCUIT RD BT BATOK WEST AVE 6
## 4819 4824 4942 4964
## JURONG EAST ST 21 LOR 1 TOA PAYOH SIMEI ST 1 ANG MO KIO AVE 5
## 5061 5077 5298 6158
## JURONG WEST ST 42 MARSILING DR ANG MO KIO AVE 4 BEDOK NTH RD
## 6203 6360 6936 7146
## BEDOK NTH ST 3 TAMPINES ST 21 HOUGANG AVE 8 ANG MO KIO AVE 3
## 7239 7929 8877 11674
## ANG MO KIO AVE 10 BEDOK RESERVOIR RD YISHUN RING RD (Other)
## 13227 14068 16597 435780
groupby_street <- data %>%
group_by(street_name) %>%
summarise(n = n()) %>%
arrange(n, decreasing = TRUE)
btm10 <- head(groupby_street,10)
top10 <- tail(groupby_street,10)
# TOP 10 street_name with highest transactions
data_filter <- data %>%
filter(street_name %in% top10$street_name)
p <- ggplot(data_filter, aes(x = fct_infreq(street_name), fill = fct_infreq(street_name))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("street_name")
p + coord_flip() + scale_y_continuous(limit = c(0, 20000))
# BOTTOM 10 street_name with lowest transactions
data_filter <- data %>%
filter(street_name %in% btm10$street_name)
p <- ggplot(data_filter, aes(x = fct_infreq(street_name), fill = fct_infreq(street_name))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("street_name")
p + coord_flip() + scale_y_continuous(limit = c(0, 25))
head(data$storey_range)
## [1] "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" "04 TO 06" "01 TO 03"
tail(data$storey_range)
## [1] "01 TO 03" "10 TO 12" "10 TO 12" "07 TO 09" "07 TO 09" "04 TO 06"
str(data$storey_range)
## chr [1:861505] "10 TO 12" "04 TO 06" "10 TO 12" "07 TO 09" "04 TO 06" ...
unique(sort(data$storey_range))
## [1] "01 TO 03" "01 TO 05" "04 TO 06" "06 TO 10" "07 TO 09" "10 TO 12"
## [7] "11 TO 15" "13 TO 15" "16 TO 18" "16 TO 20" "19 TO 21" "21 TO 25"
## [13] "22 TO 24" "25 TO 27" "26 TO 30" "28 TO 30" "31 TO 33" "31 TO 35"
## [19] "34 TO 36" "36 TO 40" "37 TO 39" "40 TO 42" "43 TO 45" "46 TO 48"
## [25] "49 TO 51"
data %>% summarise(na = sum(is.na(storey_range)))
## na
## 1 0
data <- data %>% separate(storey_range, c("storey_range_from","storey_range_to"), " TO ", remove = FALSE)
unique(sort(data$storey_range_from))
## [1] "01" "04" "06" "07" "10" "11" "13" "16" "19" "21" "22" "25" "26" "28" "31"
## [16] "34" "36" "37" "40" "43" "46" "49"
unique(sort(data$storey_range_to))
## [1] "03" "05" "06" "09" "10" "12" "15" "18" "20" "21" "24" "25" "27" "30" "33"
## [16] "35" "36" "39" "40" "42" "45" "48" "51"
data$storey_range_from <- as.integer(data$storey_range_from)
data$storey_range_to <- as.integer(data$storey_range_to)
#data$storey_range_mean <- pmin(data$storey_range_from, data$storey_range_to)
data$storey_range_mean <- (data$storey_range_from + data$storey_range_to) / 2
head(data)
## year month town flat_type block street_name storey_range
## 1 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 2 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 04 TO 06
## 3 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 4 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 07 TO 09
## 5 1990 01 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 04 TO 06
## 6 1990 01 ANG MO KIO 3 ROOM 211 ANG MO KIO AVE 3 01 TO 03
## storey_range_from storey_range_to floor_area_sqm flat_model
## 1 10 12 31 IMPROVED
## 2 4 6 31 IMPROVED
## 3 10 12 31 IMPROVED
## 4 7 9 31 IMPROVED
## 5 4 6 73 NEW GENERATION
## 6 1 3 67 NEW GENERATION
## lease_commence_date resale_price remaining_lease int_year region
## 1 1977 9000 <NA> 1990 North-East
## 2 1977 6000 <NA> 1990 North-East
## 3 1977 8000 <NA> 1990 North-East
## 4 1977 6000 <NA> 1990 North-East
## 5 1976 47200 <NA> 1990 North-East
## 6 1977 46000 <NA> 1990 North-East
## storey_range_mean
## 1 11
## 2 5
## 3 11
## 4 8
## 5 5
## 6 2
unique(sort(data$storey_range_mean))
## [1] 2 3 5 8 11 13 14 17 18 20 23 26 28 29 32 33 35 38 41 44 47 50
str(data$storey_range_from)
## int [1:861505] 10 4 10 7 4 1 7 10 4 1 ...
str(data$storey_range_to)
## int [1:861505] 12 6 12 9 6 3 9 12 6 3 ...
str(data$storey_range_mean)
## num [1:861505] 11 5 11 8 5 2 8 11 5 2 ...
data$storey_range <- as.factor(data$storey_range)
sort(summary(data$storey_range))
## 31 TO 35 36 TO 40 49 TO 51 46 TO 48 26 TO 30 43 TO 45 21 TO 25 40 TO 42
## 2 7 12 36 39 39 92 180
## 16 TO 20 37 TO 39 34 TO 36 31 TO 33 28 TO 30 11 TO 15 06 TO 10 01 TO 05
## 265 361 377 401 1241 1259 2474 2700
## 25 TO 27 22 TO 24 19 TO 21 16 TO 18 13 TO 15 10 TO 12 01 TO 03 07 TO 09
## 2847 6584 10131 21168 55716 166556 175050 196264
## 04 TO 06
## 217704
barplot(table(data$storey_range), ylim=c(0,250000), las = 2)
#barplot(table(data$storey_range_mean), ylim=c(0,250000), las = 2)
p <- ggplot(data, aes(x = fct_infreq(storey_range), fill = fct_infreq(storey_range))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("storey_range")
p + coord_flip() + scale_y_continuous(limit = c(0, 250000))
# Zoom in to storey_range with frequency < 5,000
data_filter <- data %>%
group_by(storey_range) %>%
filter(n() < 3000)
p <- ggplot(data_filter, aes(x = fct_infreq(storey_range), fill = fct_infreq(storey_range))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("storey_range")
p + coord_flip() + scale_y_continuous(limit = c(0, 3000))
barplot(table(data$storey_range_mean), ylim=c(0,250000), las = 2)
head(data$floor_area_sqm)
## [1] 31 31 31 31 73 67
tail(data$floor_area_sqm)
## [1] 127 122 146 164 152 146
str(data$floor_area_sqm)
## num [1:861505] 31 31 31 31 73 67 67 67 67 67 ...
data %>% summarise(na = sum(is.nan(floor_area_sqm)))
## na
## 1 0
summary(data$floor_area_sqm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 28.00 73.00 93.00 95.67 113.00 307.00
barplot(table(data$floor_area_sqm), ylim=c(0,70000), las = 2)
data_plot <- data
data_plot$floor_area_sqm <- as.factor(data_plot$floor_area_sqm)
p <- ggplot(data_plot, aes(x = fct_infreq(floor_area_sqm), fill = fct_infreq(floor_area_sqm))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("floor_area_sqm")
p + coord_flip() + scale_y_continuous(limit = c(0, 350000))
# Zoom in to flat type with frequency < 5,000
data_filter <- data_plot %>%
group_by(floor_area_sqm) %>%
filter(n() < 5000)
p <- ggplot(data_filter, aes(x = fct_infreq(floor_area_sqm), fill = fct_infreq(floor_area_sqm))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("floor_area_sqm")
p + coord_flip() + scale_y_continuous(limit = c(0, 5000))
summary(data$floor_area_sqm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 28.00 73.00 93.00 95.67 113.00 307.00
max(data$floor_area_sqm) - min(data$floor_area_sqm)
## [1] 279
lower_boundary <- c(seq(27, 287, by=20))
upper_boundary <- c(seq(47, 307, by=20))
lower_boundary
## [1] 27 47 67 87 107 127 147 167 187 207 227 247 267 287
upper_boundary
## [1] 47 67 87 107 127 147 167 187 207 227 247 267 287 307
bin_label <- c(paste(paste(lower_boundary, "-"),upper_boundary))
bin_label
## [1] "27 - 47" "47 - 67" "67 - 87" "87 - 107" "107 - 127" "127 - 147"
## [7] "147 - 167" "167 - 187" "187 - 207" "207 - 227" "227 - 247" "247 - 267"
## [13] "267 - 287" "287 - 307"
# range 279 divided into 14 bins - around 20 values in 1 bin
data$floor_area_sqm_bins <- cut(data$floor_area_sqm, breaks=c(seq(27, 307, by=20)), labels=bin_label)
data %>% select(floor_area_sqm, floor_area_sqm_bins) %>% head()
## floor_area_sqm floor_area_sqm_bins
## 1 31 27 - 47
## 2 31 27 - 47
## 3 31 27 - 47
## 4 31 27 - 47
## 5 73 67 - 87
## 6 67 47 - 67
str(data$floor_area_sqm_bins)
## Factor w/ 14 levels "27 - 47","47 - 67",..: 1 1 1 1 3 2 2 2 2 2 ...
summary(data$floor_area_sqm_bins)
## 27 - 47 47 - 67 67 - 87 87 - 107 107 - 127 127 - 147 147 - 167 167 - 187
## 9539 153445 179352 255087 171907 67446 21821 2216
## 187 - 207 207 - 227 227 - 247 247 - 267 267 - 287 287 - 307
## 605 31 31 18 4 3
data %>% filter(is.na(floor_area_sqm_bins)) %>% select(floor_area_sqm, floor_area_sqm_bins) %>% head()
## [1] floor_area_sqm floor_area_sqm_bins
## <0 rows> (or 0-length row.names)
barplot(table(data$floor_area_sqm_bins), ylim=c(0,300000), las = 2)
p <- ggplot(data, aes(x = fct_infreq(floor_area_sqm_bins), fill = fct_infreq(floor_area_sqm_bins))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("floor_area_sqm")
p + coord_flip() + scale_y_continuous(limit = c(0, 300000))
# Zoom in to flat type with frequency < 5,000
data_filter <- data %>%
group_by(floor_area_sqm_bins) %>%
filter(n() < 50)
p <- ggplot(data_filter, aes(x = fct_infreq(floor_area_sqm_bins), fill = fct_infreq(floor_area_sqm_bins))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("floor_area_sqm")
p + coord_flip() + scale_y_continuous(limit = c(0, 50))
head(data$flat_model)
## [1] "IMPROVED" "IMPROVED" "IMPROVED" "IMPROVED"
## [5] "NEW GENERATION" "NEW GENERATION"
tail(data$flat_model)
## [1] "Improved" "Improved" "Maisonette" "Apartment" "Maisonette"
## [6] "Maisonette"
str(data$flat_model)
## chr [1:861505] "IMPROVED" "IMPROVED" "IMPROVED" "IMPROVED" ...
unique(sort(data$flat_model))
## [1] "2-room" "2-ROOM" "Adjoined flat"
## [4] "Apartment" "APARTMENT" "DBSS"
## [7] "Improved" "IMPROVED" "Improved-Maisonette"
## [10] "IMPROVED-MAISONETTE" "Maisonette" "MAISONETTE"
## [13] "Model A" "MODEL A" "Model A-Maisonette"
## [16] "MODEL A-MAISONETTE" "Model A2" "Multi Generation"
## [19] "MULTI GENERATION" "New Generation" "NEW GENERATION"
## [22] "Premium Apartment" "PREMIUM APARTMENT" "Premium Apartment Loft"
## [25] "Premium Maisonette" "Simplified" "SIMPLIFIED"
## [28] "Standard" "STANDARD" "Terrace"
## [31] "TERRACE" "Type S1" "Type S2"
data %>% summarise(na = sum(is.na(flat_model)))
## na
## 1 0
data$flat_model <- toupper(data$flat_model)
unique(sort(data$flat_model))
## [1] "2-ROOM" "ADJOINED FLAT" "APARTMENT"
## [4] "DBSS" "IMPROVED" "IMPROVED-MAISONETTE"
## [7] "MAISONETTE" "MODEL A" "MODEL A-MAISONETTE"
## [10] "MODEL A2" "MULTI GENERATION" "NEW GENERATION"
## [13] "PREMIUM APARTMENT" "PREMIUM APARTMENT LOFT" "PREMIUM MAISONETTE"
## [16] "SIMPLIFIED" "STANDARD" "TERRACE"
## [19] "TYPE S1" "TYPE S2"
data$flat_model <- as.factor(data$flat_model)
str(data$flat_model)
## Factor w/ 20 levels "2-ROOM","ADJOINED FLAT",..: 5 5 5 5 12 12 12 12 12 12 ...
sort(summary(data$flat_model))
## 2-ROOM PREMIUM APARTMENT LOFT PREMIUM MAISONETTE
## 52 77 83
## IMPROVED-MAISONETTE TYPE S2 TYPE S1
## 118 175 350
## MULTI GENERATION TERRACE ADJOINED FLAT
## 525 668 1151
## MODEL A-MAISONETTE DBSS MODEL A2
## 1959 2469 9433
## MAISONETTE APARTMENT PREMIUM APARTMENT
## 28013 33379 39555
## STANDARD SIMPLIFIED NEW GENERATION
## 40863 55122 181433
## IMPROVED MODEL A
## 226103 239977
p <- ggplot(data, aes(x = fct_infreq(flat_model), fill = fct_infreq(flat_model))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("flat_model")
p + coord_flip() + scale_y_continuous(limit = c(0, 250000))
# Zoom in to flat_model with frequency < 5,000
data_filter <- data %>%
group_by(flat_model) %>%
filter(n() < 3000)
p <- ggplot(data_filter, aes(x = fct_infreq(flat_model), fill = fct_infreq(flat_model))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("flat_model")
p + coord_flip() + scale_y_continuous(limit = c(0, 3000))
data$new_flat_model <- ifelse (data$flat_model %in% c("2-ROOM","APARTMENT","ADJOINED FLAT","DBSS",
"IMPROVED","MODEL A","MODEL A2"), "Flat",
ifelse (data$flat_model %in% c("MAISONETTE","IMPROVED-MAISONETTE",
"MODEL A-MAISONETTE"), "Maisonette",
ifelse (grepl("PREMIUM", data$flat_model), "Premium",
ifelse (data$flat_model %in% c("TERRACE","TYPE S1","TYPE S2"), "Terrace",
"Others"))))
head(data)
## year month town flat_type block street_name storey_range
## 1 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 2 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 04 TO 06
## 3 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 10 TO 12
## 4 1990 01 ANG MO KIO 1 ROOM 309 ANG MO KIO AVE 1 07 TO 09
## 5 1990 01 ANG MO KIO 3 ROOM 216 ANG MO KIO AVE 1 04 TO 06
## 6 1990 01 ANG MO KIO 3 ROOM 211 ANG MO KIO AVE 3 01 TO 03
## storey_range_from storey_range_to floor_area_sqm flat_model
## 1 10 12 31 IMPROVED
## 2 4 6 31 IMPROVED
## 3 10 12 31 IMPROVED
## 4 7 9 31 IMPROVED
## 5 4 6 73 NEW GENERATION
## 6 1 3 67 NEW GENERATION
## lease_commence_date resale_price remaining_lease int_year region
## 1 1977 9000 <NA> 1990 North-East
## 2 1977 6000 <NA> 1990 North-East
## 3 1977 8000 <NA> 1990 North-East
## 4 1977 6000 <NA> 1990 North-East
## 5 1976 47200 <NA> 1990 North-East
## 6 1977 46000 <NA> 1990 North-East
## storey_range_mean floor_area_sqm_bins new_flat_model
## 1 11 27 - 47 Flat
## 2 5 27 - 47 Flat
## 3 11 27 - 47 Flat
## 4 8 27 - 47 Flat
## 5 5 67 - 87 Others
## 6 2 47 - 67 Others
head(data$new_flat_model)
## [1] "Flat" "Flat" "Flat" "Flat" "Others" "Others"
tail(data$new_flat_model)
## [1] "Flat" "Flat" "Maisonette" "Flat" "Maisonette"
## [6] "Maisonette"
str(data$new_flat_model)
## chr [1:861505] "Flat" "Flat" "Flat" "Flat" "Others" "Others" "Others" ...
data$new_flat_model <- as.factor(data$new_flat_model)
str(data$new_flat_model)
## Factor w/ 5 levels "Flat","Maisonette",..: 1 1 1 1 3 3 3 3 3 3 ...
sort(summary(data$new_flat_model))
## Terrace Maisonette Premium Others Flat
## 1193 30090 39715 277943 512564
p <- ggplot(data, aes(x = fct_infreq(new_flat_model), fill = fct_infreq(new_flat_model))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("new_flat_model")
p + coord_flip() + scale_y_continuous(limit = c(0, 550000))
# Zoom in to new_flat_model with frequency < 5,000
data_filter <- data %>%
group_by(new_flat_model) %>%
filter(n() < 5000)
p <- ggplot(data_filter, aes(x = fct_infreq(new_flat_model), fill = fct_infreq(new_flat_model))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("new_flat_model")
p + coord_flip() + scale_y_continuous(limit = c(0, 5000))
head(data$lease_commence_date)
## [1] 1977 1977 1977 1977 1976 1977
tail(data$lease_commence_date)
## [1] 1987 1988 1988 1992 1985 1985
str(data$lease_commence_date)
## int [1:861505] 1977 1977 1977 1977 1976 1977 1977 1977 1977 1977 ...
unique(sort(data$lease_commence_date))
## [1] 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980
## [16] 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
## [31] 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010
## [46] 2011 2012 2013 2014 2015 2016 2017 2018 2019
data %>% summarise(na = sum(is.na(lease_commence_date)))
## na
## 1 0
summary(data$lease_commence_date)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1966 1980 1986 1988 1995 2019
# check whether remaining lease is calculated correctly
data %>%
filter(!is.na(remaining_lease) && remaining_lease != (99 - (data$int_year - data$lease_commence_date))) %>%
select(remaining_lease)
## [1] remaining_lease
## <0 rows> (or 0-length row.names)
head(data$resale_price)
## [1] 9000 6000 8000 6000 47200 46000
tail(data$resale_price)
## [1] 500000 580000 540000 738000 592000 545000
str(data$resale_price)
## num [1:861505] 9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
data %>% summarise(na = sum(is.na(resale_price)))
## na
## 1 0
summary(data$resale_price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5000 185000 282000 302067 395000 1360000
barplot(table(data$resale_price), ylim=c(0,8000), las = 2)
summary(data$resale_price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5000 185000 282000 302067 395000 1360000
max(data$resale_price) - min(data$resale_price)
## [1] 1355000
lower_boundary <- c(seq(4000, 1224400, by=135600))
upper_boundary <- c(seq(139600, 1360000, by=135600))
lower_boundary
## [1] 4000 139600 275200 410800 546400 682000 817600 953200 1088800
## [10] 1224400
upper_boundary
## [1] 139600 275200 410800 546400 682000 817600 953200 1088800 1224400
## [10] 1360000
bin_label <- c(paste(paste(lower_boundary, "-"),upper_boundary))
bin_label
## [1] "4000 - 139600" "139600 - 275200" "275200 - 410800"
## [4] "410800 - 546400" "546400 - 682000" "682000 - 817600"
## [7] "817600 - 953200" "953200 - 1088800" "1088800 - 1224400"
## [10] "1224400 - 1360000"
data$resale_price_bins <- cut(data$resale_price, breaks=c(seq(4000, 1360000, by=135600)), labels=bin_label)
data %>% select(resale_price, resale_price_bins) %>% head()
## resale_price resale_price_bins
## 1 9000 4000 - 139600
## 2 6000 4000 - 139600
## 3 8000 4000 - 139600
## 4 6000 4000 - 139600
## 5 47200 4000 - 139600
## 6 46000 4000 - 139600
str(data$resale_price_bins)
## Factor w/ 10 levels "4000 - 139600",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(data$resale_price_bins)
## 4000 - 139600 139600 - 275200 275200 - 410800 410800 - 546400
## 105819 311311 257074 129253
## 546400 - 682000 682000 - 817600 817600 - 953200 953200 - 1088800
## 39188 13228 4617 843
## 1088800 - 1224400 1224400 - 1360000
## 149 23
data %>% filter(is.na(resale_price_bins)) %>% select(resale_price, resale_price_bins) %>% head()
## [1] resale_price resale_price_bins
## <0 rows> (or 0-length row.names)
barplot(table(data$resale_price_bins), ylim=c(0,350000), las = 2)
p <- ggplot(data, aes(x = fct_infreq(resale_price_bins), fill = fct_infreq(resale_price_bins))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("resale_price")
p + coord_flip() + scale_y_continuous(limit = c(0, 350000))
# Zoom in to resale_price with frequency < 5,000
data_filter <- data %>%
group_by(resale_price_bins) %>%
filter(n() < 5000)
p <- ggplot(data_filter, aes(x = fct_infreq(resale_price_bins), fill = fct_infreq(resale_price_bins))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("resale_price")
p + coord_flip() + scale_y_continuous(limit = c(0, 5000))
Resale price ranges from 139600-275200 has the highest transaction records, more than 300,000 rows out of total 861,505 rows
Resale price ranges from 1224400-1360000 has the lowest transaction records, less than 500 rows
any outliers?
boxplot(data$resale_price ~ data$year)
boxplot(data$resale_price ~ data$region)
boxplot(data$resale_price ~ data$floor_area_sqm_bins)
plot(data$region ~ data$resale_price)
library(ggplot2)
ggplot(data) +
aes(x = resale_price, y = floor_area_sqm_bins, colour = region) +
geom_point() +
scale_color_hue()
head(data$remaining_lease)
## [1] NA NA NA NA NA NA
tail(data$remaining_lease)
## [1] NA NA NA NA NA NA
str(data$remaining_lease)
## chr [1:861505] NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA ...
data %>% summarise(na = sum(is.na(remaining_lease)))
## na
## 1 709050
summary(data$remaining_lease)
## Length Class Mode
## 861505 character character
data_test <- data
data_test$used_lease <- data_test$int_year - data_test$lease_commence_date
data_test$remaining_lease <- as.integer(99 - data_test$used_lease)
str(data_test$remaining_lease)
## int [1:861505] 86 86 86 86 85 86 86 86 86 86 ...
data_test %>% summarise(na = sum(is.na(remaining_lease)))
## na
## 1 0
summary(data_test$remaining_lease)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 44.00 75.00 83.00 81.65 90.00 101.00
data_test %>% filter(remaining_lease > 99) %>% select (int_year, used_lease, lease_commence_date, remaining_lease)
## int_year used_lease lease_commence_date remaining_lease
## 1 1991 -1 1992 100
## 2 1992 -1 1993 100
## 3 1993 -1 1994 100
## 4 1994 -1 1995 100
## 5 1994 -1 1995 100
## 6 1994 -2 1996 101
## 7 1994 -1 1995 100
## 8 1994 -1 1995 100
## 9 1994 -1 1995 100
## 10 1994 -1 1995 100
## 11 1994 -1 1995 100
## 12 1994 -1 1995 100
## 13 1994 -1 1995 100
## 14 1994 -1 1995 100
## 15 1994 -1 1995 100
## 16 1994 -1 1995 100
## 17 1994 -1 1995 100
## 18 1994 -1 1995 100
## 19 1994 -1 1995 100
## 20 1994 -1 1995 100
## 21 1994 -2 1996 101
## 22 1995 -1 1996 100
## 23 1995 -1 1996 100
## 24 1995 -1 1996 100
## 25 1995 -1 1996 100
## 26 1995 -1 1996 100
## 27 1995 -1 1996 100
## 28 1995 -1 1996 100
## 29 1995 -1 1996 100
## 30 1995 -1 1996 100
## 31 1995 -1 1996 100
## 32 1995 -1 1996 100
## 33 1995 -1 1996 100
## 34 1996 -1 1997 100
## 35 1996 -1 1997 100
## 36 1996 -1 1997 100
## 37 2004 -2 2006 101
## 38 2004 -2 2006 101
## 39 2005 -1 2006 100
## 40 2005 -1 2006 100
## 41 2005 -1 2006 100
## 42 2005 -1 2006 100
## 43 2005 -1 2006 100
## 44 2005 -1 2006 100
## 45 2005 -1 2006 100
## 46 2005 -1 2006 100
## 47 2005 -1 2006 100
## 48 2005 -1 2006 100
## 49 2005 -1 2006 100
## 50 2005 -1 2006 100
## 51 2011 -1 2012 100
data_test %>% summarise(count = sum(remaining_lease > 99))
## count
## 1 51
# update remaining lease to the actual df
data$remaining_lease <- data_test$remaining_lease
# drop these 51 records
data <- subset(data,remaining_lease <= 99)
summary(data$remaining_lease)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 44.00 75.00 83.00 81.65 90.00 99.00
str(data$remaining_lease)
## int [1:861454] 86 86 86 86 85 86 86 86 86 86 ...
barplot(table(data$remaining_lease), ylim=c(0,50000), las = 2)
data_plot <- data
data_plot$remaining_lease <- as.factor(data_plot$remaining_lease)
p <- ggplot(data_plot, aes(x = fct_infreq(remaining_lease), fill = fct_infreq(remaining_lease))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("remaining_lease")
p + coord_flip() + scale_y_continuous(limit = c(0, 50000))
# Zoom in to remaining_lease with frequency < 5,000
data_filter <- data_plot %>%
group_by(remaining_lease) %>%
filter(n() < 1000)
p <- ggplot(data_filter, aes(x = fct_infreq(remaining_lease), fill = fct_infreq(remaining_lease))) +
geom_bar(width = 1, colour = "black", show.legend = FALSE) +
xlab("remaining_lease")
p + coord_flip() + scale_y_continuous(limit = c(0, 1000))
data <- data[,-which(names(data) %in% c("lease_commence_date","int_year"))]
summary(data)
## year month town
## 1999 : 57782 07 : 77636 TAMPINES : 75695
## 1998 : 51095 10 : 77447 YISHUN : 65336
## 2001 : 38055 08 : 74958 BEDOK : 63327
## 2002 : 36098 03 : 73996 JURONG WEST: 62545
## 1996 : 34916 11 : 73685 WOODLANDS : 60661
## 2000 : 34862 06 : 73470 ANG MO KIO : 49537
## (Other):608646 (Other):410262 (Other) :484353
## flat_type block street_name
## 1 ROOM : 1284 2 : 4423 YISHUN RING RD : 16597
## 2 ROOM : 10326 1 : 3859 BEDOK RESERVOIR RD: 14068
## 3 ROOM :280095 110 : 3263 ANG MO KIO AVE 10 : 13227
## 4 ROOM :324030 101 : 3249 ANG MO KIO AVE 3 : 11674
## 5 ROOM :179771 4 : 3192 HOUGANG AVE 8 : 8877
## EXECUTIVE : 65423 8 : 3156 TAMPINES ST 21 : 7929
## MULTI-GENERATION: 525 (Other):840312 (Other) :789082
## storey_range storey_range_from storey_range_to floor_area_sqm
## 04 TO 06:217692 Min. : 1.000 Min. : 3.000 Min. : 28.00
## 07 TO 09:196254 1st Qu.: 4.000 1st Qu.: 6.000 1st Qu.: 73.00
## 01 TO 03:175041 Median : 7.000 Median : 9.000 Median : 93.00
## 10 TO 12:166548 Mean : 6.593 Mean : 8.609 Mean : 95.67
## 13 TO 15: 55714 3rd Qu.:10.000 3rd Qu.:12.000 3rd Qu.:113.00
## 16 TO 18: 21167 Max. :49.000 Max. :51.000 Max. :307.00
## (Other) : 29038
## flat_model resale_price remaining_lease
## MODEL A :239958 Min. : 5000 Min. :44.00
## IMPROVED :226084 1st Qu.: 185000 1st Qu.:75.00
## NEW GENERATION :181433 Median : 282000 Median :83.00
## SIMPLIFIED : 55122 Mean : 302064 Mean :81.65
## STANDARD : 40863 3rd Qu.: 395000 3rd Qu.:90.00
## PREMIUM APARTMENT: 39555 Max. :1360000 Max. :99.00
## (Other) : 78439
## region storey_range_mean floor_area_sqm_bins new_flat_model
## Central :143439 Min. : 2.000 87 - 107 :255075 Flat :512514
## East :170437 1st Qu.: 5.000 67 - 87 :179352 Maisonette: 30089
## North :137662 Median : 8.000 107 - 127:171890 Others :277943
## North-East:161389 Mean : 7.601 47 - 67 :153445 Premium : 39715
## Suburb : 33743 3rd Qu.:11.000 127 - 147: 67431 Terrace : 1193
## West :214784 Max. :50.000 147 - 167: 21816
## (Other) : 12445
## resale_price_bins
## 139600 - 275200:311302
## 275200 - 410800:257045
## 410800 - 546400:129244
## 4000 - 139600 :105818
## 546400 - 682000: 39185
## 682000 - 817600: 13228
## (Other) : 5632
str(data)
## 'data.frame': 861454 obs. of 18 variables:
## $ year : Factor w/ 32 levels "1990","1991",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ month : Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ town : Factor w/ 27 levels "ANG MO KIO","BEDOK",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ flat_type : Factor w/ 7 levels "1 ROOM","2 ROOM",..: 1 1 1 1 3 3 3 3 3 3 ...
## $ block : Factor w/ 2567 levels "1","10","100",..: 799 799 799 799 431 414 372 503 503 498 ...
## $ street_name : Factor w/ 576 levels "ADMIRALTY DR",..: 14 14 14 14 14 17 17 17 17 17 ...
## $ storey_range : Factor w/ 25 levels "01 TO 03","01 TO 05",..: 6 3 6 5 3 1 5 6 3 1 ...
## $ storey_range_from : int 10 4 10 7 4 1 7 10 4 1 ...
## $ storey_range_to : int 12 6 12 9 6 3 9 12 6 3 ...
## $ floor_area_sqm : num 31 31 31 31 73 67 67 67 67 67 ...
## $ flat_model : Factor w/ 20 levels "2-ROOM","ADJOINED FLAT",..: 5 5 5 5 12 12 12 12 12 12 ...
## $ resale_price : num 9000 6000 8000 6000 47200 46000 42000 38000 40000 47000 ...
## $ remaining_lease : int 86 86 86 86 85 86 86 86 86 86 ...
## $ region : Factor w/ 6 levels "Central","East",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ storey_range_mean : num 11 5 11 8 5 2 8 11 5 2 ...
## $ floor_area_sqm_bins: Factor w/ 14 levels "27 - 47","47 - 67",..: 1 1 1 1 3 2 2 2 2 2 ...
## $ new_flat_model : Factor w/ 5 levels "Flat","Maisonette",..: 1 1 1 1 3 3 3 3 3 3 ...
## $ resale_price_bins : Factor w/ 10 levels "4000 - 139600",..: 1 1 1 1 1 1 1 1 1 1 ...
write.csv(data,"../data/data_clean.csv", row.names = FALSE)